Write a short description about the course and add a link to your github repository here. This is an R markdown (.Rmd) file so you can use R markdown syntax. See the ‘Useful links’ page in the mooc area (chapter 1) for instructions.
This is my Open Data Science course book, containing all the exercise solutions and also some random projects for extra practice.
# Read the wrangled learning2014 data set (166 students, 7 variables).
# NOTE(review): sep="," and header=TRUE are read.csv() defaults.
analysis_data <- read.csv("~/Documents/GitHub/IODS-project/data/learning2014.csv",
sep=",", header=TRUE)
# Inspect the structure: variable names, types and first values
str(analysis_data)
## 'data.frame': 166 obs. of 7 variables:
## $ gender : Factor w/ 2 levels "F","M": 1 2 1 2 2 1 2 1 2 1 ...
## $ age : int 53 55 49 53 49 38 50 37 37 42 ...
## $ attitude: int 37 31 25 35 37 38 35 29 38 21 ...
## $ deep : num 0.97 0.789 0.947 0.947 0.992 ...
## $ stra : num 0.314 0.256 0.307 0.307 0.322 ...
## $ surf : num 0.925 1.134 0.806 0.806 1.015 ...
## $ points : int 25 12 24 10 22 21 21 31 24 26 ...
# Dimensions: rows x columns
dim(analysis_data)
## [1] 166 7
# Variable-by-variable summaries of the data
summary(analysis_data$gender)
## F M
## 110 56
summary(analysis_data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 21.00 22.00 25.51 27.00 55.00
summary(analysis_data$attitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 26.00 32.00 31.43 37.00 50.00
summary(analysis_data$deep)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4284 0.9019 0.9921 0.9956 1.1049 1.3303
summary(analysis_data$stra)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1389 0.2923 0.3216 0.3227 0.3581 0.4312
summary(analysis_data$surf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5670 0.8655 1.0147 0.9981 1.1341 1.5519
summary(analysis_data$points)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 19.00 23.00 22.72 27.75 33.00
# Load plotting libraries.
# NOTE(review): GGally attaches ggplot2 automatically, so the explicit
# library(ggplot2) call below is redundant but harmless.
library(GGally)
## Loading required package: ggplot2
library(ggplot2)
# Pairwise plot matrix of all variables, coloured by gender
ggpairs(analysis_data, mapping = aes( col = gender, alpha = 0.2 ), lower = list(combo = wrap("facethist", bins = 20)))
# Fit a multiple linear regression: exam points explained by attitude,
# age and strategic learning score (stra).
my_model <- lm(points ~ attitude + age + stra, data = analysis_data)
summary(my_model)
##
## Call:
## lm(formula = points ~ attitude + age + stra, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.206 -3.430 0.204 3.979 10.950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.60773 3.38966 4.605 8.32e-06 ***
## attitude 0.35941 0.05696 6.310 2.56e-09 ***
## age -0.07716 0.05322 -1.450 0.149
## stra -6.87318 8.55576 -0.803 0.423
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.307 on 162 degrees of freedom
## Multiple R-squared: 0.2043, Adjusted R-squared: 0.1896
## F-statistic: 13.87 on 3 and 162 DF, p-value: 4.305e-08
# Refit with attitude only, since age and stra were not statistically
# significant in the first model.
my_model2 <- lm(points ~ attitude, data = analysis_data)
summary(my_model2)
##
## Call:
## lm(formula = points ~ attitude, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.9763 -3.2119 0.4339 4.1534 10.6645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.63715 1.83035 6.358 1.95e-09 ***
## attitude 0.35255 0.05674 6.214 4.12e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.32 on 164 degrees of freedom
## Multiple R-squared: 0.1906, Adjusted R-squared: 0.1856
## F-statistic: 38.61 on 1 and 164 DF, p-value: 4.119e-09
# Arrange diagnostic plots in a 2x2 grid
par(mfrow = c(2,2))
# Residuals vs Fitted (1), Normal Q-Q (2), Residuals vs Leverage (5)
plot(my_model, which = c(1, 2, 5))
# Drop three observations.
# NOTE(review): rows 2, 4 and 56 are presumably the influential points seen
# in the diagnostic plots above — confirm the row indices against the plots.
analysis_data_new <- analysis_data[-c(2, 4, 56), ]
# Refit the full model without those observations
my_model3 <- lm(points ~ attitude + age + stra, data = analysis_data_new)
summary(my_model3)
##
## Call:
## lm(formula = points ~ attitude + age + stra, data = analysis_data_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.8988 -3.4558 0.0807 3.8415 11.5969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.30158 3.21710 4.445 1.64e-05 ***
## attitude 0.38062 0.05399 7.049 5.18e-11 ***
## age 0.04040 0.05658 0.714 0.476
## stra -13.31399 8.20912 -1.622 0.107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.01 on 159 degrees of freedom
## Multiple R-squared: 0.2415, Adjusted R-squared: 0.2271
## F-statistic: 16.87 on 3 and 159 DF, p-value: 1.455e-09
# Load the libraries needed for the logistic-regression chapter.
# NOTE(review): ggplot2 and GGally were already attached above; reloading is
# a harmless no-op. MASS is loaded last, so MASS::select() masks
# dplyr::select() — call dplyr::select() explicitly if needed later.
library("readxl")
library("ggplot2")
library("data.table")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("stringr")
library("tidyr")
library("GGally")
library("ggplot2")
library("MASS")
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Read the joined student alcohol consumption data (382 students, 35 variables).
# NOTE(review): sep="," and header=TRUE are read.csv() defaults.
analysis.data <- read.csv("~/Documents/GitHub/IODS-project/data/alc.csv",
sep=",", header=TRUE)
# Compact per-variable overview (dplyr)
glimpse(analysis.data)
## Observations: 382
## Variables: 35
## $ school <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP,...
## $ sex <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F, ...
## $ age <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15,...
## $ address <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, ...
## $ famsize <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, G...
## $ Pstatus <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T, ...
## $ Medu <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, ...
## $ Fedu <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, ...
## $ Mjob <fct> at_home, at_home, at_home, health, other, services,...
## $ Fjob <fct> teacher, other, other, services, other, other, othe...
## $ reason <fct> course, course, other, home, home, reputation, home...
## $ nursery <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, ye...
## $ internet <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes,...
## $ guardian <fct> mother, father, mother, mother, father, mother, mot...
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, ...
## $ studytime <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, ...
## $ failures <int> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ schoolsup <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no, ...
## $ famsup <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes,...
## $ paid <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes, ...
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes,...
## $ higher <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ romantic <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no...
## $ famrel <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, ...
## $ freetime <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, ...
## $ goout <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, ...
## $ Dalc <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Walc <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, ...
## $ health <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, ...
## $ absences <int> 5, 3, 8, 1, 2, 8, 0, 4, 0, 0, 1, 2, 1, 1, 0, 5, 8, ...
## $ G1 <int> 2, 7, 10, 14, 8, 14, 12, 8, 16, 13, 12, 10, 13, 11,...
## $ G2 <int> 8, 8, 10, 14, 12, 14, 12, 9, 17, 14, 11, 12, 14, 11...
## $ G3 <int> 8, 8, 11, 14, 12, 14, 12, 10, 18, 14, 12, 12, 13, 1...
## $ alc_use <dbl> 1.0, 1.0, 2.5, 1.0, 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1...
## $ high_use <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FAL...
# Reshape to long (key, value) format to check the gathered structure.
# NOTE(review): tidyr::gather() is superseded; pivot_longer() is the modern
# replacement.
gather(analysis.data) %>% glimpse
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Observations: 13,370
## Variables: 2
## $ key <chr> "school", "school", "school", "school", "school", "schoo...
## $ value <chr> "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "G...
# Bar plot of every variable's distribution
gather(analysis.data) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped
# dplyr was already attached above; this reload is a harmless no-op
library("dplyr")
# Keep only the variables chosen for modelling high alcohol use
chosen.data <- analysis.data[, c("age", "sex", "Pstatus", "famrel", "high_use")]
# Draw a bar plot of chosen variables
gather(chosen.data) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Draw a correlation plot of chosen variables, coloured by sex
ggpairs(chosen.data, mapping = aes( col = sex, alpha = 0.2 ), lower = list(combo = wrap("facethist", bins = 20)))
# Get cross tabulation tables for the chosen variables
library(gmodels)
# Cross-tabulate sex against high alcohol use
CrossTable(chosen.data$sex, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$sex | FALSE | TRUE | Row Total |
## ----------------|-----------|-----------|-----------|
## F | 156 | 42 | 198 |
## | 2.102 | 4.942 | |
## | 0.788 | 0.212 | 0.518 |
## | 0.582 | 0.368 | |
## | 0.408 | 0.110 | |
## ----------------|-----------|-----------|-----------|
## M | 112 | 72 | 184 |
## | 2.262 | 5.318 | |
## | 0.609 | 0.391 | 0.482 |
## | 0.418 | 0.632 | |
## | 0.293 | 0.188 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## ----------------|-----------|-----------|-----------|
##
##
# Cross-tabulate age against high alcohol use
CrossTable(chosen.data$age, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$age | FALSE | TRUE | Row Total |
## ----------------|-----------|-----------|-----------|
## 15 | 63 | 18 | 81 |
## | 0.671 | 1.576 | |
## | 0.778 | 0.222 | 0.212 |
## | 0.235 | 0.158 | |
## | 0.165 | 0.047 | |
## ----------------|-----------|-----------|-----------|
## 16 | 79 | 28 | 107 |
## | 0.206 | 0.484 | |
## | 0.738 | 0.262 | 0.280 |
## | 0.295 | 0.246 | |
## | 0.207 | 0.073 | |
## ----------------|-----------|-----------|-----------|
## 17 | 64 | 36 | 100 |
## | 0.540 | 1.270 | |
## | 0.640 | 0.360 | 0.262 |
## | 0.239 | 0.316 | |
## | 0.168 | 0.094 | |
## ----------------|-----------|-----------|-----------|
## 18 | 54 | 27 | 81 |
## | 0.141 | 0.331 | |
## | 0.667 | 0.333 | 0.212 |
## | 0.201 | 0.237 | |
## | 0.141 | 0.071 | |
## ----------------|-----------|-----------|-----------|
## 19 | 7 | 4 | 11 |
## | 0.067 | 0.157 | |
## | 0.636 | 0.364 | 0.029 |
## | 0.026 | 0.035 | |
## | 0.018 | 0.010 | |
## ----------------|-----------|-----------|-----------|
## 20 | 1 | 0 | 1 |
## | 0.127 | 0.298 | |
## | 1.000 | 0.000 | 0.003 |
## | 0.004 | 0.000 | |
## | 0.003 | 0.000 | |
## ----------------|-----------|-----------|-----------|
## 22 | 0 | 1 | 1 |
## | 0.702 | 1.649 | |
## | 0.000 | 1.000 | 0.003 |
## | 0.000 | 0.009 | |
## | 0.000 | 0.003 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## ----------------|-----------|-----------|-----------|
##
##
# Cross-tabulate parental cohabitation status against high alcohol use
CrossTable(chosen.data$Pstatus, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$Pstatus | FALSE | TRUE | Row Total |
## --------------------|-----------|-----------|-----------|
## A | 26 | 12 | 38 |
## | 0.016 | 0.038 | |
## | 0.684 | 0.316 | 0.099 |
## | 0.097 | 0.105 | |
## | 0.068 | 0.031 | |
## --------------------|-----------|-----------|-----------|
## T | 242 | 102 | 344 |
## | 0.002 | 0.004 | |
## | 0.703 | 0.297 | 0.901 |
## | 0.903 | 0.895 | |
## | 0.634 | 0.267 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## --------------------|-----------|-----------|-----------|
##
##
# Cross-tabulate family relationship quality against high alcohol use
CrossTable(chosen.data$famrel, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$famrel | FALSE | TRUE | Row Total |
## -------------------|-----------|-----------|-----------|
## 1 | 6 | 2 | 8 |
## | 0.027 | 0.063 | |
## | 0.750 | 0.250 | 0.021 |
## | 0.022 | 0.018 | |
## | 0.016 | 0.005 | |
## -------------------|-----------|-----------|-----------|
## 2 | 10 | 9 | 19 |
## | 0.832 | 1.955 | |
## | 0.526 | 0.474 | 0.050 |
## | 0.037 | 0.079 | |
## | 0.026 | 0.024 | |
## -------------------|-----------|-----------|-----------|
## 3 | 39 | 25 | 64 |
## | 0.775 | 1.823 | |
## | 0.609 | 0.391 | 0.168 |
## | 0.146 | 0.219 | |
## | 0.102 | 0.065 | |
## -------------------|-----------|-----------|-----------|
## 4 | 135 | 54 | 189 |
## | 0.044 | 0.102 | |
## | 0.714 | 0.286 | 0.495 |
## | 0.504 | 0.474 | |
## | 0.353 | 0.141 | |
## -------------------|-----------|-----------|-----------|
## 5 | 78 | 24 | 102 |
## | 0.580 | 1.362 | |
## | 0.765 | 0.235 | 0.267 |
## | 0.291 | 0.211 | |
## | 0.204 | 0.063 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## -------------------|-----------|-----------|-----------|
##
##
# Model with glm(): logistic regression of high_use on the chosen predictors
chosen.mod <- glm(high_use ~ Pstatus + age + sex + famrel, data = chosen.data, family = "binomial")
summary(chosen.mod)
##
## Call:
## glm(formula = high_use ~ Pstatus + age + sex + famrel, family = "binomial",
## data = chosen.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5439 -0.8479 -0.6640 1.2254 2.0504
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.88778 1.71500 -2.267 0.0234 *
## PstatusT -0.13249 0.38409 -0.345 0.7301
## age 0.23479 0.09865 2.380 0.0173 *
## sexM 0.94515 0.23601 4.005 6.21e-05 ***
## famrel -0.32119 0.12560 -2.557 0.0106 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 438.92 on 377 degrees of freedom
## AIC: 448.92
##
## Number of Fisher Scoring iterations: 4
# Compute odds ratios (OR): exponentiate the log-odds coefficients
chosen.mod.OR <- exp(coef(chosen.mod))
# Compute profile-likelihood confidence intervals (CI) ONCE and reuse them.
# FIX: the original called confint() twice, repeating the slow profiling step.
chosen.mod.CI.log <- confint(chosen.mod)
## Waiting for profiling to be done...
# CIs on the log-odds scale
chosen.mod.CI.log
## 2.5 % 97.5 %
## (Intercept) -7.29460682 -0.54975516
## PstatusT -0.86780384 0.64983706
## age 0.04291218 0.43089580
## sexM 0.48770904 1.41454481
## famrel -0.56968043 -0.07549712
# Exponentiate the CIs to the odds-ratio scale
chosen.mod.CI <- exp(chosen.mod.CI.log)
# print out the odds ratios with their confidence intervals
cbind(chosen.mod.OR, chosen.mod.CI)
## chosen.mod.OR 2.5 % 97.5 %
## (Intercept) 0.02049085 0.0006791919 0.5770911
## PstatusT 0.87590845 0.4198726450 1.9152287
## age 1.26464541 1.0438462211 1.5386352
## sexM 2.57320663 1.6285809238 4.1146131
## famrel 0.72528788 0.5657061903 0.9272824
# predict() the probability of high_use from the fitted model
probabilities <- predict(chosen.mod, type = "response")
# add the predicted probabilities to 'chosen.data'
chosen.data <- mutate(chosen.data, probability = probabilities)
# use the probabilities to make a prediction of high_use (cutoff 0.5)
chosen.data <- mutate(chosen.data, prediction = probabilities > 0.5)
# tabulate the target variable versus the predictions (confusion matrix)
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 259 9
## TRUE 98 16
# initialize a plot of 'high_use' versus 'probability' in 'chosen.data'
g <- ggplot(chosen.data, aes(x = probability, y = high_use, col = prediction))
# define the geom as points and draw the plot
g + geom_point()
# tabulate the target variable versus the predictions (same table as above)
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 259 9
## TRUE 98 16
# Use %>% to apply the prop.table() function on the output of table():
# cell proportions of the whole table
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction) %>% prop.table()
## prediction
## high_use FALSE TRUE
## FALSE 0.67801047 0.02356021
## TRUE 0.25654450 0.04188482
# Use %>% to apply the addmargins() function on the output of prop.table():
# adds row/column sums to the proportion table
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction) %>% prop.table() %>% addmargins()
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.67801047 0.02356021 0.70157068
## TRUE 0.25654450 0.04188482 0.29842932
## Sum 0.93455497 0.06544503 1.00000000
# Loss function: mean prediction error, i.e. the share of observations whose
# predicted probability falls on the wrong side of the 0.5 threshold.
# class: observed outcome (0/1 or logical); prob: predicted probability.
loss_func <- function(class, prob) {
  mean(abs(class - prob) > 0.5)
}
# call loss_func to compute the average number of wrong predictions in the (training) data using the above function
loss_func(class = chosen.data$high_use, prob = chosen.data$probability)
## [1] 0.2801047
# Calculate misclassification error using an R package to confirm the above results.
# NOTE(review): optimalCutoff()/misClassError()/Concordance()/sensitivity()/
# specificity() expect predicted probability SCORES; here the binary
# prediction vector is passed instead — consider chosen.data$probability.
library(InformationValue)
optCutOff <- optimalCutoff(chosen.data$high_use, chosen.data$prediction)[1]
misClassError(chosen.data$high_use, chosen.data$prediction, threshold = optCutOff)
## [1] 0.2801
# Calculating concordance
Concordance(chosen.data$high_use, chosen.data$prediction)
## $Concordance
## [1] 0.1356376
##
## $Discordance
## [1] 0.8643624
##
## $Tied
## [1] -1.110223e-16
##
## $Pairs
## [1] 30552
# Calculating sensitivity/specificity of our model
sensitivity(chosen.data$high_use, chosen.data$prediction, threshold = optCutOff)
## [1] 0.1403509
specificity(chosen.data$high_use, chosen.data$prediction, threshold = optCutOff)
## [1] 0.9664179
# Construct an ROC curve
# NOTE(review): plotROC also expects probabilities, not 0/1 predictions;
# with binary input the "curve" collapses to a single point.
library(plotROC)
plotROC(chosen.data$high_use, chosen.data$prediction)
## K-fold cross-validation
library(boot)
# NOTE(review): K is set to the number of rows (the leave-one-out choice) but
# is NOT passed to cv.glm() below, which uses K = 10 (10-fold CV) instead.
K <- nrow(chosen.data) # number of observations (currently unused; see note)
cv_chosen <- cv.glm(data = chosen.data, cost = loss_func, glmfit = chosen.mod, K = 10)
## average number of wrong predictions in the 10-fold cross-validation
cv_chosen$delta[1]
## [1] 0.2827225
What we see above is that the average prediction error from the above model is about 0.28 (0.2801 on the training data, 0.2827 in 10-fold cross-validation), which is higher than the roughly 0.26 error of the model explored in DataCamp. Hence, the model explored in DataCamp has better predictive power for alcohol use than the one explored here.
## Define big model to compare: all candidate predictors (upper scope for step)
big.model <- glm(high_use ~ school + sex + age + address + famsize + Pstatus + G3 + failures + famrel + famsup + freetime + goout + schoolsup + absences + health + Medu + Fedu +
activities + paid, data = analysis.data, family = binomial(link = "logit"))
## Define null model to compare: intercept only (lower scope / starting point)
null.model <- glm(high_use ~ 1, data = analysis.data, family = binomial(link = "logit"))
## Determine the model with a bidirectional stepwise procedure (AIC-based),
## reporting a chi-squared LRT for each candidate add/drop.
step.model <- step(null.model, scope = list(upper = big.model), direction = "both",
test = "Chisq", data = analysis.data)
## Start: AIC=467.68
## high_use ~ 1
##
## Df Deviance AIC LRT Pr(>Chi)
## + goout 1 415.68 419.68 50.001 1.537e-12 ***
## + absences 1 447.45 451.45 18.233 1.955e-05 ***
## + sex 1 450.95 454.95 14.732 0.0001239 ***
## + freetime 1 456.84 460.84 8.840 0.0029469 **
## + failures 1 456.98 460.98 8.698 0.0031864 **
## + G3 1 459.43 463.43 6.252 0.0124059 *
## + age 1 460.83 464.83 4.851 0.0276320 *
## + famrel 1 460.92 464.92 4.756 0.0292035 *
## + address 1 462.30 466.30 3.375 0.0661994 .
## + famsize 1 463.48 467.48 2.200 0.1380308
## <none> 465.68 467.68
## + health 1 464.29 468.29 1.389 0.2385700
## + activities 1 464.43 468.43 1.245 0.2645257
## + schoolsup 1 464.51 468.51 1.165 0.2803902
## + paid 1 464.80 468.80 0.877 0.3491564
## + school 1 465.13 469.13 0.553 0.4572172
## + famsup 1 465.19 469.19 0.485 0.4860956
## + Fedu 1 465.46 469.46 0.215 0.6427752
## + Pstatus 1 465.62 469.62 0.060 0.8062405
## + Medu 1 465.63 469.63 0.046 0.8298479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=419.68
## high_use ~ goout
##
## Df Deviance AIC LRT Pr(>Chi)
## + absences 1 402.50 408.50 13.175 0.0002837 ***
## + sex 1 402.74 408.74 12.935 0.0003224 ***
## + famrel 1 408.12 414.12 7.562 0.0059609 **
## + address 1 409.72 415.72 5.960 0.0146306 *
## + failures 1 410.92 416.92 4.753 0.0292404 *
## + G3 1 413.39 419.39 2.293 0.1299925
## + health 1 413.42 419.42 2.263 0.1324990
## + famsize 1 413.51 419.51 2.166 0.1411078
## <none> 415.68 419.68
## + activities 1 413.68 419.68 2.000 0.1573155
## + age 1 414.38 420.38 1.299 0.2543646
## + paid 1 414.53 420.53 1.147 0.2840893
## + freetime 1 414.75 420.75 0.931 0.3345969
## + schoolsup 1 414.77 420.77 0.910 0.3402438
## + school 1 414.79 420.79 0.886 0.3464949
## + famsup 1 415.36 421.36 0.314 0.5753984
## + Pstatus 1 415.54 421.54 0.142 0.7065896
## + Fedu 1 415.57 421.57 0.111 0.7387151
## + Medu 1 415.64 421.64 0.035 0.8522261
## - goout 1 465.68 467.68 50.001 1.537e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=408.5
## high_use ~ goout + absences
##
## Df Deviance AIC LRT Pr(>Chi)
## + sex 1 387.75 395.75 14.748 0.0001229 ***
## + famrel 1 395.79 403.79 6.716 0.0095572 **
## + address 1 397.09 405.09 5.413 0.0199833 *
## + failures 1 398.40 406.40 4.107 0.0427168 *
## + health 1 400.08 408.08 2.427 0.1192936
## + activities 1 400.24 408.24 2.262 0.1325869
## <none> 402.50 408.50
## + famsize 1 400.54 408.54 1.962 0.1613260
## + G3 1 400.92 408.92 1.583 0.2083813
## + paid 1 400.92 408.92 1.582 0.2084805
## + school 1 400.98 408.98 1.526 0.2166950
## + freetime 1 401.21 409.21 1.288 0.2563846
## + schoolsup 1 401.50 409.50 1.003 0.3164808
## + age 1 401.95 409.95 0.550 0.4581523
## + famsup 1 402.06 410.06 0.446 0.5044225
## + Medu 1 402.25 410.25 0.254 0.6144158
## + Fedu 1 402.47 410.47 0.035 0.8512142
## + Pstatus 1 402.50 410.50 0.006 0.9380858
## - absences 1 415.68 419.68 13.175 0.0002837 ***
## - goout 1 447.45 451.45 44.943 2.028e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=395.75
## high_use ~ goout + absences + sex
##
## Df Deviance AIC LRT Pr(>Chi)
## + famrel 1 379.81 389.81 7.946 0.0048195 **
## + address 1 382.70 392.70 5.058 0.0245093 *
## + activities 1 383.70 393.70 4.051 0.0441470 *
## + paid 1 384.50 394.50 3.255 0.0711919 .
## + failures 1 385.06 395.06 2.693 0.1008032
## <none> 387.75 395.75
## + school 1 385.95 395.95 1.803 0.1793763
## + G3 1 386.44 396.44 1.312 0.2520448
## + famsize 1 386.58 396.58 1.176 0.2781516
## + health 1 386.77 396.77 0.985 0.3209508
## + Medu 1 387.02 397.02 0.731 0.3926563
## + age 1 387.09 397.09 0.663 0.4156185
## + freetime 1 387.47 397.47 0.280 0.5964214
## + schoolsup 1 387.61 397.61 0.143 0.7050365
## + famsup 1 387.74 397.74 0.013 0.9095974
## + Fedu 1 387.75 397.75 0.000 0.9944014
## + Pstatus 1 387.75 397.75 0.000 0.9949057
## - sex 1 402.50 408.50 14.748 0.0001229 ***
## - absences 1 402.74 408.74 14.988 0.0001082 ***
## - goout 1 430.07 436.07 42.314 7.773e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=389.81
## high_use ~ goout + absences + sex + famrel
##
## Df Deviance AIC LRT Pr(>Chi)
## + address 1 374.76 386.76 5.044 0.0247052 *
## + activities 1 376.23 388.23 3.581 0.0584394 .
## + paid 1 376.64 388.64 3.168 0.0750934 .
## + failures 1 377.79 389.79 2.020 0.1552693
## <none> 379.81 389.81
## + health 1 377.91 389.91 1.904 0.1676831
## + school 1 378.55 390.55 1.261 0.2615349
## + G3 1 378.81 390.81 1.000 0.3172697
## + famsize 1 378.89 390.89 0.921 0.3372367
## + age 1 378.93 390.93 0.883 0.3473595
## + Medu 1 378.97 390.97 0.835 0.3607934
## + freetime 1 379.07 391.07 0.738 0.3902071
## + schoolsup 1 379.69 391.69 0.122 0.7271839
## + famsup 1 379.77 391.77 0.042 0.8367076
## + Pstatus 1 379.80 391.80 0.013 0.9084113
## + Fedu 1 379.80 391.80 0.005 0.9461156
## - famrel 1 387.75 395.75 7.946 0.0048195 **
## - absences 1 393.80 401.80 13.993 0.0001835 ***
## - sex 1 395.79 403.79 15.979 6.406e-05 ***
## - goout 1 424.86 432.86 45.048 1.923e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=386.76
## high_use ~ goout + absences + sex + famrel + address
##
## Df Deviance AIC LRT Pr(>Chi)
## + activities 1 370.67 384.67 4.094 0.0430315 *
## + paid 1 370.92 384.92 3.844 0.0499390 *
## <none> 374.76 386.76
## + health 1 373.08 387.08 1.685 0.1942380
## + failures 1 373.09 387.09 1.670 0.1962137
## + famsize 1 373.41 387.41 1.353 0.2447313
## + freetime 1 373.99 387.99 0.773 0.3793315
## + G3 1 374.37 388.37 0.392 0.5311653
## + Medu 1 374.46 388.46 0.301 0.5835271
## + age 1 374.47 388.47 0.292 0.5889660
## + school 1 374.49 388.49 0.275 0.6000857
## + schoolsup 1 374.70 388.70 0.063 0.8020037
## + famsup 1 374.73 388.73 0.032 0.8586738
## + Fedu 1 374.74 388.74 0.025 0.8739745
## + Pstatus 1 374.76 388.76 0.000 0.9993591
## - address 1 379.81 389.81 5.044 0.0247052 *
## - famrel 1 382.70 392.70 7.932 0.0048564 **
## - absences 1 388.50 398.50 13.738 0.0002101 ***
## - sex 1 390.30 400.30 15.539 8.084e-05 ***
## - goout 1 422.01 432.01 47.242 6.273e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=384.67
## high_use ~ goout + absences + sex + famrel + address + activities
##
## Df Deviance AIC LRT Pr(>Chi)
## + paid 1 366.49 382.49 4.185 0.0407904 *
## <none> 370.67 384.67
## + health 1 369.12 385.12 1.547 0.2135619
## + failures 1 369.46 385.46 1.214 0.2704852
## + famsize 1 369.53 385.53 1.143 0.2850968
## + freetime 1 369.78 385.78 0.888 0.3460988
## + age 1 370.48 386.48 0.191 0.6621013
## + Fedu 1 370.52 386.52 0.153 0.6960310
## + G3 1 370.54 386.54 0.134 0.7142250
## + school 1 370.55 386.55 0.124 0.7246465
## + Medu 1 370.56 386.56 0.108 0.7421703
## + Pstatus 1 370.61 386.61 0.063 0.8024291
## + famsup 1 370.65 386.65 0.023 0.8805015
## + schoolsup 1 370.65 386.65 0.019 0.8899358
## - activities 1 374.76 386.76 4.094 0.0430315 *
## - address 1 376.23 388.23 5.557 0.0184020 *
## - famrel 1 378.14 390.14 7.466 0.0062860 **
## - absences 1 384.72 396.72 14.051 0.0001779 ***
## - sex 1 387.95 399.95 17.275 3.234e-05 ***
## - goout 1 419.34 431.34 48.665 3.037e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=382.49
## high_use ~ goout + absences + sex + famrel + address + activities +
## paid
##
## Df Deviance AIC LRT Pr(>Chi)
## + failures 1 364.29 382.29 2.198 0.1381945
## <none> 366.49 382.49
## + health 1 364.50 382.50 1.983 0.1590665
## + famsize 1 365.31 383.31 1.180 0.2772746
## + freetime 1 365.43 383.43 1.051 0.3052950
## + famsup 1 366.10 384.10 0.381 0.5368105
## + G3 1 366.11 384.11 0.372 0.5416643
## + Medu 1 366.12 384.12 0.367 0.5444822
## + age 1 366.33 384.33 0.156 0.6931483
## + school 1 366.42 384.42 0.068 0.7936917
## + Fedu 1 366.42 384.42 0.061 0.8047582
## + Pstatus 1 366.47 384.47 0.011 0.9173748
## + schoolsup 1 366.48 384.48 0.003 0.9594304
## - paid 1 370.67 384.67 4.185 0.0407904 *
## - activities 1 370.92 384.92 4.435 0.0352018 *
## - address 1 372.90 386.90 6.409 0.0113517 *
## - famrel 1 374.01 388.01 7.523 0.0060923 **
## - absences 1 381.53 395.53 15.046 0.0001049 ***
## - sex 1 386.10 400.10 19.612 9.487e-06 ***
## - goout 1 415.98 429.98 49.499 1.985e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=382.29
## high_use ~ goout + absences + sex + famrel + address + activities +
## paid + failures
##
## Df Deviance AIC LRT Pr(>Chi)
## <none> 364.29 382.29
## - failures 1 366.49 382.49 2.198 0.1381945
## + health 1 362.70 382.70 1.585 0.2080158
## + famsize 1 363.01 383.01 1.278 0.2582321
## + freetime 1 363.30 383.30 0.983 0.3214915
## + Fedu 1 363.80 383.80 0.484 0.4866195
## + famsup 1 363.87 383.87 0.415 0.5195099
## - activities 1 368.10 384.10 3.817 0.0507355 .
## + school 1 364.20 384.20 0.087 0.7678936
## + Medu 1 364.24 384.24 0.049 0.8248779
## + age 1 364.25 384.25 0.034 0.8532051
## + G3 1 364.27 384.27 0.020 0.8881974
## + schoolsup 1 364.28 384.28 0.005 0.9437839
## + Pstatus 1 364.29 384.29 0.000 0.9839522
## - paid 1 369.46 385.46 5.168 0.0230019 *
## - address 1 370.26 386.26 5.975 0.0145064 *
## - famrel 1 371.11 387.11 6.826 0.0089854 **
## - absences 1 378.63 394.63 14.345 0.0001522 ***
## - sex 1 382.40 398.40 18.113 2.081e-05 ***
## - goout 1 409.81 425.81 45.518 1.512e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Store the stepwise-selected model's coefficients as a data frame.
# NOTE(review): final.model.coeff is not used later in this document.
final.model.coeff <- as.data.frame(step.model$coefficients)
# Refit the model chosen by step() explicitly
final.mod <- glm(high_use ~ sex + address + failures + famrel + goout + absences +
activities + paid, data = analysis.data, family = binomial(link = "logit"))
summary(final.mod)
##
## Call:
## glm(formula = high_use ~ sex + address + failures + famrel +
## goout + absences + activities + paid, family = binomial(link = "logit"),
## data = analysis.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8829 -0.7374 -0.4707 0.6885 2.8657
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.49941 0.73483 -3.401 0.000671 ***
## sexM 1.13462 0.27425 4.137 3.52e-05 ***
## addressU -0.77509 0.31595 -2.453 0.014157 *
## failures 0.32272 0.21815 1.479 0.139051
## famrel -0.37983 0.14566 -2.608 0.009119 **
## goout 0.80068 0.12880 6.217 5.08e-10 ***
## absences 0.08401 0.02218 3.788 0.000152 ***
## activitiesyes -0.51728 0.26676 -1.939 0.052487 .
## paidyes 0.61261 0.27244 2.249 0.024536 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 364.29 on 373 degrees of freedom
## AIC: 382.29
##
## Number of Fisher Scoring iterations: 5
# Get final model data: the predictors selected by step() plus the target
final.data <- analysis.data[, c("goout", "absences", "sex", "famrel", "address",
"activities", "paid", "failures", "high_use")]
# predict() the probability of high_use from the final model
probabilities.final.mod <- predict(final.mod, type = "response")
# add the predicted probabilities to 'final.data'
final.data <- mutate(final.data, probability = probabilities.final.mod)
# use the probabilities to make a prediction of high_use (cutoff 0.5)
final.data <- mutate(final.data, prediction = probabilities.final.mod > 0.5)
# tabulate the target variable versus the predictions (confusion matrix)
table(high_use = final.data$high_use, prediction = final.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 252 16
## TRUE 59 55
# initialize a plot of 'high_use' versus 'probability' in 'final.data'
g_final <- ggplot(final.data, aes(x = probability, y = high_use, col = prediction))
# define the geom as points and draw the plot
g_final + geom_point()
# tabulate the target variable versus the predictions (same table as above)
table(high_use = final.data$high_use, prediction = final.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 252 16
## TRUE 59 55
# Use %>% to apply the prop.table() function on the output of table()
table(high_use = final.data$high_use, prediction = final.data$prediction) %>% prop.table()
## prediction
## high_use FALSE TRUE
## FALSE 0.65968586 0.04188482
## TRUE 0.15445026 0.14397906
# Use %>% to apply the addmargins() function on the output of prop.table()
table(high_use = final.data$high_use, prediction = final.data$prediction) %>% prop.table() %>% addmargins()
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.65968586 0.04188482 0.70157068
## TRUE 0.15445026 0.14397906 0.29842932
## Sum 0.81413613 0.18586387 1.00000000
# Loss function (mean prediction error): the proportion of observations whose
# predicted probability falls on the wrong side of 0.5 relative to the true class.
loss_func <- function(class, prob) {
  misclassified <- abs(class - prob) > 0.5
  mean(misclassified)
}
# call loss_func to compute the average number of wrong predictions in the (training) data using the above function
loss_func(class = final.data$high_use, prob = final.data$probability)
## [1] 0.1963351
# Calculate misclassification error using an R package to confirm the above results
# NOTE(review): the InformationValue functions below expect predicted
# *probabilities*; passing the logical 'prediction' column makes the cutoff
# search degenerate — final.data$probability may be intended. TODO confirm.
library(InformationValue)
optCutOff_final <- optimalCutoff(final.data$high_use, final.data$prediction)[1]
misClassError(final.data$high_use, final.data$prediction, threshold = optCutOff_final)
## [1] 0.1963
# Calculating concordance
Concordance(final.data$high_use, final.data$prediction)
## $Concordance
## [1] 0.4536528
##
## $Discordance
## [1] 0.5463472
##
## $Tied
## [1] 0
##
## $Pairs
## [1] 30552
# Calculating sensitivity/specificity of our model
# Fixed: these calls previously used 'optCutOff' (the cutoff from an earlier
# model, or undefined here); use the cutoff computed for the final model.
sensitivity(final.data$high_use, final.data$prediction, threshold = optCutOff_final)
## [1] 0.4824561
specificity(final.data$high_use, final.data$prediction, threshold = optCutOff_final)
## [1] 0.9402985
# Construct an ROC curve
library(plotROC)
# ROC for Final chosen model
plotROC(final.data$high_use, final.data$prediction)
# ROC for initially chosen model
plotROC(chosen.data$high_use, chosen.data$prediction)
## K-fold cross-validation
library(boot)
K_final <- nrow(final.data) # NOTE(review): nrow() would give leave-one-out CV, but this variable is unused — cv.glm below runs 10-fold CV (K = 10); confirm which was intended
cv_final <- cv.glm(data = final.data, cost = loss_func, glmfit = final.mod, K = 10)
## average number of wrong predictions in the cross validation
cv_final$delta[1]
## [1] 0.2120419
library("readxl")
library("ggplot2")
library("data.table")
library("dplyr")
library("stringr")
library("tidyr")
library("GGally")
library("ggplot2")
library("MASS")
library("corrplot")
## corrplot 0.84 loaded
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the Boston housing dataset shipped with the MASS package
data("Boston")
glimpse(Boston)
## Observations: 506
## Variables: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, ...
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5,...
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, ...
## $ chas <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524...
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172...
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0,...
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605...
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, ...
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311,...
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, ...
## $ black <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60...
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.9...
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, ...
# Quick index plot of the median home values
# Fixed: 'Boston$med' relied on $ partial name matching; spell out 'medv'.
plot(Boston$medv)
# General summary of the dataset
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Matrix of the variables: pairwise scatterplots of all 14 columns
pairs(Boston)
# Correlation matrix, rounded to two decimals for readable corrplot labels
cor_matrix <- cor(Boston) %>% round(2)
corrplot(cor_matrix, method = "circle", type = "upper",
cl.pos = "b", tl.pos = "d", tl.cex = 0.6)
# How do other variables stack up against crime rates? Do we see patterns?
# melt() reshapes Boston to long format (one row per variable/value pair),
# keeping 'crim' as the id column — presumably data.table::melt dispatches
# here, since data.table is loaded above; confirm reshape2 is not intended.
molten <- melt(Boston, id = "crim")
ggplot(molten, aes(x = value, y = crim))+
facet_wrap( ~ variable, scales = "free")+
geom_point()
# Centering and standardizing variables
boston_scaled <- scale(Boston)
# Summaries of the scaled variables
glimpse(boston_scaled)
## num [1:506, 1:14] -0.419 -0.417 -0.417 -0.416 -0.412 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:506] "1" "2" "3" "4" ...
## ..$ : chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:center")= Named num [1:14] 3.6135 11.3636 11.1368 0.0692 0.5547 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:scale")= Named num [1:14] 8.602 23.322 6.86 0.254 0.116 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
summary(boston_scaled)
## crim zn indus
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668
## Median :-0.390280 Median :-0.48724 Median :-0.2109
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202
## chas nox rm age
## Min. :-0.2723 Min. :-1.4644 Min. :-3.8764 Min. :-2.3331
## 1st Qu.:-0.2723 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366
## Median :-0.2723 Median :-0.1441 Median :-0.1084 Median : 0.3171
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2723 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059
## Max. : 3.6648 Max. : 2.7296 Max. : 3.5515 Max. : 1.1164
## dis rad tax ptratio
## Min. :-1.2658 Min. :-0.9819 Min. :-1.3127 Min. :-2.7047
## 1st Qu.:-0.8049 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876
## Median :-0.2790 Median :-0.5225 Median :-0.4642 Median : 0.2746
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6617 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058
## Max. : 3.9566 Max. : 1.6596 Max. : 1.7964 Max. : 1.6372
## black lstat medv
## Min. :-3.9033 Min. :-1.5296 Min. :-1.9063
## 1st Qu.: 0.2049 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median : 0.3808 Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4332 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 0.4406 Max. : 3.5453 Max. : 2.9865
# Class of boston_scaled object
class(boston_scaled)
## [1] "matrix"
# Converting to data frame
boston_scaled <- as.data.frame(boston_scaled)
# Summary of the scaled crime rate
summary(Boston$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
# Quantile vector of 'crim' — the quartiles give four (roughly) equal-sized bins
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
# Categorical variable 'crime' from 'crim', binned at the quartiles.
# Fixed: spell out 'labels =' in full (the original relied on partial
# argument matching with 'label =').
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
# Tabulation of the new factor crime
table(crime)
## crime
## low med_low med_high high
## 127 126 126 127
# Removing original 'crim' from the dataset
boston_scaled <- dplyr::select(boston_scaled, -crim)
# Adding the new categorical value to scaled data
boston_scaled <- data.frame(boston_scaled, crime)
# Number of rows in the Boston dataset
n <- nrow(boston_scaled)
n
## [1] 506
# Choosing randomly 80% of the rows for the training set
# NOTE(review): no set.seed() before sample(), so this split (and every
# downstream result) is not reproducible across runs — confirm intent.
ind <- sample(n, size = n * 0.8)
ind
## [1] 417 395 406 357 27 405 15 79 176 121 88 344 267 327 171 296 444
## [18] 31 12 113 499 180 157 308 240 105 270 58 292 430 428 342 279 298
## [35] 273 51 429 498 123 75 35 376 48 170 303 329 20 311 271 287 167
## [52] 92 414 341 259 161 109 421 411 101 404 148 260 319 250 416 141 307
## [69] 166 155 490 89 442 206 323 266 215 95 446 476 356 160 28 7 209
## [86] 464 470 284 131 262 55 288 222 53 289 505 265 354 385 56 481 258
## [103] 415 68 336 159 275 34 96 234 389 491 282 364 231 502 210 358 436
## [120] 169 253 500 78 193 104 97 9 239 309 462 108 252 353 156 205 433
## [137] 218 504 485 199 117 72 83 93 361 237 198 371 456 268 80 274 483
## [154] 316 407 480 493 106 408 100 71 122 345 16 321 254 277 276 325 331
## [171] 151 369 91 249 460 126 299 401 60 478 22 346 278 6 66 195 165
## [188] 135 49 246 463 174 426 367 153 439 143 69 394 26 200 226 373 477
## [205] 19 489 182 116 359 133 115 247 294 283 306 39 33 355 50 76 348
## [222] 238 310 136 37 120 64 241 110 219 381 454 360 492 52 350 370 29
## [239] 479 418 248 326 351 145 235 8 420 185 257 474 132 322 213 178 17
## [256] 173 98 318 225 349 74 375 501 396 365 191 398 229 242 362 445 211
## [273] 335 190 84 147 301 214 207 62 484 352 13 67 21 388 312 114 194
## [290] 188 302 441 217 5 154 41 256 337 251 368 269 43 497 150 383 402
## [307] 77 87 343 86 324 467 372 94 125 447 128 432 142 399 47 162 152
## [324] 431 65 384 187 332 423 73 380 468 413 138 486 221 146 315 469 112
## [341] 59 290 255 378 18 377 232 451 410 449 220 45 184 427 54 305 42
## [358] 425 347 189 400 25 202 10 46 243 118 320 107 127 465 186 224 412
## [375] 494 450 40 506 379 472 119 457 129 455 263 374 70 458 124 297 392
## [392] 366 488 201 419 227 11 334 99 386 363 422 330 163
# Training set (the 80% of rows sampled above)
train <- boston_scaled[ind,]
# Test set: the remaining 20% of rows
test <- boston_scaled[-ind,]
# Saving correct classes from test data (used later to score the predictions)
correct_classes <- test$crime
# Removing 'crime' variable from test data so the model cannot see the target
test <- dplyr::select(test, -crime)
# Linear discriminant analysis: predict 'crime' class from all other variables
lda.fit <- lda(crime ~., data = train)
# lda.fit object
lda.fit
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## low med_low med_high high
## 0.2376238 0.2673267 0.2425743 0.2524752
##
## Group means:
## zn indus chas nox rm
## low 1.0014004 -0.8912636 -0.067271764 -0.8909202 0.3897645
## med_low -0.1015445 -0.2534451 0.019307987 -0.5643776 -0.1466938
## med_high -0.3717344 0.1796268 0.209764839 0.4350017 0.1036770
## high -0.4872402 1.0149946 -0.002135914 1.0325378 -0.4278221
## age dis rad tax ptratio
## low -0.9119635 0.9386323 -0.6827912 -0.7183610 -0.40961012
## med_low -0.3017652 0.3539023 -0.5554496 -0.4555328 -0.09237093
## med_high 0.3980875 -0.3727657 -0.4287320 -0.3259889 -0.33107442
## high 0.7954768 -0.8697800 1.6596029 1.5294129 0.80577843
## black lstat medv
## low 0.37145812 -0.746364398 0.48447156
## med_low 0.34509817 -0.100956085 -0.03125288
## med_high 0.04556271 0.009259188 0.17372924
## high -0.84638067 0.875326487 -0.64144898
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.10674185 0.738605037 -0.96179206
## indus 0.01791507 -0.139816596 0.32632494
## chas -0.08488627 -0.006290236 0.11343741
## nox 0.28464468 -0.862021831 -1.41860923
## rm -0.12959933 -0.106299897 -0.13901959
## age 0.22417991 -0.283558467 0.04507872
## dis -0.11608657 -0.265842863 0.19637862
## rad 3.75646086 0.891226353 -0.05791582
## tax -0.03289022 0.018142262 0.60037939
## ptratio 0.19712918 0.012941619 -0.35889849
## black -0.16862906 0.042012922 0.19552760
## lstat 0.18158897 -0.222769974 0.27628704
## medv 0.21466654 -0.379543496 -0.24809354
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9575 0.0309 0.0116
# Draw variable-loading arrows (with labels) onto an existing LDA plot.
#   x           : a fitted lda object — its coefficients are taken via coef()
#   myscale     : multiplier applied to the arrow lengths
#   arrow_heads : arrow-head length passed to arrows()
#   color       : arrow and label colour
#   tex         : label text size (cex)
#   choices     : which two discriminants form the x/y axes
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  coefs <- coef(x)
  x_ends <- myscale * coefs[, choices[1]]
  y_ends <- myscale * coefs[, choices[2]]
  # Arrows from the origin to the scaled coefficients
  arrows(x0 = 0, y0 = 0, x1 = x_ends, y1 = y_ends,
         col = color, length = arrow_heads)
  # Variable names just above the arrow tips
  text(myscale * coefs[, choices], labels = row.names(coefs),
       cex = tex, col = color, pos = 3)
}
classes <- as.numeric(train$crime)
# plot the lda results
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 2)
# predict classes with test data (lda.pred$class holds the predicted factor)
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results
tab <- table(correct = correct_classes, predicted = lda.pred$class)
tab
## predicted
## correct low med_low med_high high
## low 15 15 1 0
## med_low 4 9 5 0
## med_high 0 11 15 2
## high 0 0 1 24
# Row-normalise the confusion matrix: each row gives the share of a true
# class assigned to each predicted class.
# NOTE(review): prop.table(tab, margin = 1) computes the same thing in one call.
pred1 <- rbind(tab[1, ]/sum(tab[1, ]), tab[2, ]/sum(tab[2, ]))
pred2 <- rbind(tab[3, ]/sum(tab[3, ]), tab[4, ]/sum(tab[4, ]))
Predict_accuracy <- rbind(pred1, pred2)
# rbind() dropped the row names; restore them (class order matches the columns)
rownames(Predict_accuracy) <- colnames(Predict_accuracy)
Predict_accuracy
## low med_low med_high high
## low 0.4838710 0.4838710 0.03225806 0.00000000
## med_low 0.2222222 0.5000000 0.27777778 0.00000000
## med_high 0.0000000 0.3928571 0.53571429 0.07142857
## high 0.0000000 0.0000000 0.04000000 0.96000000
require(caret)
## Loading required package: caret
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following objects are masked from 'package:InformationValue':
##
## confusionMatrix, precision, sensitivity, specificity
confusionMatrix(correct_classes, lda.pred$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction low med_low med_high high
## low 15 15 1 0
## med_low 4 9 5 0
## med_high 0 11 15 2
## high 0 0 1 24
##
## Overall Statistics
##
## Accuracy : 0.6176
## 95% CI : (0.5161, 0.7121)
## No Information Rate : 0.3431
## P-Value [Acc > NIR] : 1.429e-08
##
## Kappa : 0.4977
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: low Class: med_low Class: med_high Class: high
## Sensitivity 0.7895 0.25714 0.6818 0.9231
## Specificity 0.8072 0.86567 0.8375 0.9868
## Pos Pred Value 0.4839 0.50000 0.5357 0.9600
## Neg Pred Value 0.9437 0.69048 0.9054 0.9740
## Prevalence 0.1863 0.34314 0.2157 0.2549
## Detection Rate 0.1471 0.08824 0.1471 0.2353
## Detection Prevalence 0.3039 0.17647 0.2745 0.2451
## Balanced Accuracy 0.7984 0.56141 0.7597 0.9550
# Euclidean distance matrix
boston_scaled <- dplyr::select(boston_scaled, -crime)
dist_eu <- dist(boston_scaled)
# Summary of the distances
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.3984 4.7296 4.7597 6.0150 12.5315
# Manhattan distance matrix
dist_man <- dist(boston_scaled, method = 'manhattan')
# Summary of the distances
summary(dist_man)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2645 8.2073 12.0993 12.8752 16.8027 43.5452
# k-means clustering with 4
km4 <-kmeans(boston_scaled, centers = 4)
# plot the Boston dataset with clusters
# For this, we choose 5 variables - dis, medv, black, lstat and tax
pairs(boston_scaled[c("dis", "medv", "black", "lstat", "tax")], col = km4$cluster)
# k-means clustering with 3
km3 <-kmeans(boston_scaled, centers = 3)
# plot the Boston dataset with clusters
# For this, we choose 5 variables - dis, medv, black, lstat and tax
pairs(boston_scaled[c("dis", "medv", "black", "lstat", "tax")], col = km3$cluster)
set.seed(100)
# Compute and plot cluster addition & variance explained for k = 2 to k = 15.
k.max <- 15
data <- boston_scaled
clust_TSS <- sapply(1:k.max,
function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
clust_TSS
## [1] 6565.000 4207.600 3519.743 3098.744 2746.623 2399.515 2143.440
## [8] 1955.816 1832.813 1736.480 1637.171 1561.280 1498.560 1464.093
## [15] 1385.043
plot(1:k.max, clust_TSS,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
# k-means clustering with 4 clusters on the scaled data
km_bonus <- kmeans(boston_scaled, centers = 4)
# Linear discriminant analysis with clusters from k-means as target.
# Keep only the cluster labels for the rows that are in the training set.
mat <- as.matrix(km_bonus$cluster)
cluster_target <- mat[ rownames(mat) %in% rownames(train), ]
# NOTE(review): 'train' still contains the 'crime' factor, so it is used as
# a predictor here as well — confirm that is intended.
lda.fit.bonus <- lda(cluster_target ~., data = train)
# target classes as numeric (for plotting colours/symbols)
classes <- as.numeric(cluster_target)
# Plot the lda results
plot(lda.fit.bonus, dimen = 2, col = classes, pch = classes)
# Fixed: arrows were previously drawn from the earlier 'lda.fit' (the crime
# model) onto this plot; use the k-means-based fit instead.
lda.arrows(lda.fit.bonus, myscale = 2)
Run the code below for the (scaled) train data that you used to fit the LDA. The code creates a matrix product, which is a projection of the data points.
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
# matrix multiplication
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
# Plot with crime class in train
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers', color = train$crime)
# Plot with k-means clusters
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers', color = cluster_target)
library("readxl")
library("ggplot2")
library("data.table")
library("dplyr")
library("stringr")
library("tidyverse")
## ── Attaching packages ──────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 1.4.2 ✔ purrr 0.2.5
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::lift() masks caret::lift()
## ✖ plotly::select() masks MASS::select(), dplyr::select()
## ✖ purrr::transpose() masks data.table::transpose()
library("GGally")
library("ggplot2")
library("MASS")
library("corrplot")
library("plotly")
library("purrr")
library("devtools")
library("ggbiplot")
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
##
## compact
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
library("gridExtra")
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library("grid")
library("lattice")
library("FactoMineR")
analysis.data <- read.table("~/Documents/GitHub/IODS-project/data/humans2.txt",
header=TRUE)
glimpse(analysis.data)
## Observations: 155
## Variables: 8
## $ Life.expectancy <dbl> 81.6, 82.4, 83.0, 80.2, 81.6, 80.9, 80.9, 79...
## $ Exp.school.years <dbl> 17.5, 20.2, 15.8, 18.7, 17.9, 16.5, 18.6, 16...
## $ GNIncome <int> 64992, 42261, 56431, 44025, 45435, 43919, 39...
## $ Mat.Mort.Rate <int> 4, 6, 6, 5, 6, 7, 9, 28, 11, 8, 6, 4, 8, 4, ...
## $ Adol.Birth.Rate <dbl> 7.8, 12.1, 1.9, 5.1, 6.2, 3.8, 8.2, 31.0, 14...
## $ Rep.Parliament... <dbl> 39.6, 30.5, 28.5, 38.0, 36.9, 36.9, 19.9, 19...
## $ Edu.F2M <dbl> 1.0072389, 0.9968288, 0.9834369, 0.9886128, ...
## $ Lab.F2M <dbl> 0.8908297, 0.8189415, 0.8251001, 0.8840361, ...
# General summary of the dataset
summary(analysis.data)
## Life.expectancy Exp.school.years GNIncome Mat.Mort.Rate
## Min. :49.00 Min. : 5.40 Min. : 581 Min. : 1.0
## 1st Qu.:66.30 1st Qu.:11.25 1st Qu.: 4198 1st Qu.: 11.5
## Median :74.20 Median :13.50 Median : 12040 Median : 49.0
## Mean :71.65 Mean :13.18 Mean : 17628 Mean : 149.1
## 3rd Qu.:77.25 3rd Qu.:15.20 3rd Qu.: 24512 3rd Qu.: 190.0
## Max. :83.50 Max. :20.20 Max. :123124 Max. :1100.0
## Adol.Birth.Rate Rep.Parliament... Edu.F2M Lab.F2M
## Min. : 0.60 Min. : 0.00 Min. :0.1717 Min. :0.1857
## 1st Qu.: 12.65 1st Qu.:12.40 1st Qu.:0.7264 1st Qu.:0.5984
## Median : 33.60 Median :19.30 Median :0.9375 Median :0.7535
## Mean : 47.16 Mean :20.91 Mean :0.8529 Mean :0.7074
## 3rd Qu.: 71.95 3rd Qu.:27.95 3rd Qu.:0.9968 3rd Qu.:0.8535
## Max. :204.80 Max. :57.50 Max. :1.4967 Max. :1.0380
# Matrix of the variables
pairs(analysis.data)
# Correlation matrix
cor_matrix <- cor(analysis.data) %>% round(2)
corrplot(cor_matrix, method = "circle", type = "upper",
cl.pos = "b", tl.pos = "d", tl.cex = 0.6)
# Visualise distributions of the variables
analysis.data %>% keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) + facet_wrap(~ key, scales = "free") + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# PCA
pca_human <- prcomp(analysis.data)
# Draw a biplot of the principal component representation and the original variables
# Using the ggbiplot package instead of normal biplot
ggbiplot(pca_human, choices=c(1,2), ellipse = TRUE, labels = rownames(analysis.data)) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top')
# Standardize dataset
human_std <- scale(analysis.data)
# PCA
pca_human_std <- prcomp(human_std)
# Draw a biplot of the principal component representation and the original variables
# Using the ggbiplot package instead of normal biplot
ggbiplot(pca_human_std, choices=c(1,2), ellipse = TRUE, labels = rownames(analysis.data)) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top')
pca_human
## Standard deviations (1, .., p=8):
## [1] 1.854416e+04 1.855219e+02 2.518701e+01 1.145441e+01 3.766241e+00
## [6] 1.565912e+00 1.912052e-01 1.591112e-01
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4
## Life.expectancy -2.815823e-04 -0.0283150248 -1.294971e-02 6.752684e-02
## Exp.school.years -9.562910e-05 -0.0075529759 -1.427664e-02 3.313505e-02
## GNIncome -9.999832e-01 0.0057723054 5.156742e-04 -4.932889e-05
## Mat.Mort.Rate 5.655734e-03 0.9916320120 -1.260302e-01 6.100534e-03
## Adol.Birth.Rate 1.233961e-03 0.1255502723 9.918113e-01 -5.301595e-03
## Rep.Parliament... -5.526460e-05 -0.0032317269 7.398331e-03 9.971232e-01
## Edu.F2M -5.607472e-06 -0.0006713951 3.412027e-05 2.736326e-04
## Lab.F2M 2.331945e-07 0.0002819357 -5.302884e-04 4.692578e-03
## PC5 PC6 PC7 PC8
## Life.expectancy 0.9865644425 1.453515e-01 -5.380452e-03 2.281723e-03
## Exp.school.years 0.1431180282 -9.882477e-01 3.826887e-02 7.776451e-03
## GNIncome -0.0001135863 2.711698e-05 8.075191e-07 -1.176762e-06
## Mat.Mort.Rate 0.0266373214 -1.695203e-03 -1.355518e-04 8.371934e-04
## Adol.Birth.Rate 0.0188618600 -1.273198e-02 8.641234e-05 -1.707885e-04
## Rep.Parliament... -0.0716401914 2.309896e-02 2.642548e-03 2.680113e-03
## Edu.F2M -0.0022935252 -2.180183e-02 -6.998623e-01 7.139410e-01
## Lab.F2M 0.0022190154 -3.264423e-02 -7.132267e-01 -7.001533e-01
summary(pca_human)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.854e+04 185.5219 25.19 11.45 3.766 1.566 0.1912
## Proportion of Variance 9.999e-01 0.0001 0.00 0.00 0.000 0.000 0.0000
## Cumulative Proportion 9.999e-01 1.0000 1.00 1.00 1.000 1.000 1.0000
## PC8
## Standard deviation 0.1591
## Proportion of Variance 0.0000
## Cumulative Proportion 1.0000
# Compute standard deviation of each principal component
std_dev <- pca_human$sdev
# Compute variance
pr_var <- std_dev^2
# Proportion of variance explained
prop_varex <- pr_var/sum(pr_var)
# Scree plot
plot(prop_varex, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")
# Cumulative scree plot
plot(cumsum(prop_varex), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b")
# Plot each variable's coefficients inside a unit circle to get insight on a possible interpretation for PCs
# Parametrise the unit circle with 100 points
theta <- seq(0,2*pi,length.out = 100)
circle <- data.frame(x = cos(theta), y = sin(theta))
p <- ggplot(circle,aes(x,y)) + geom_path()
# Loadings (rotation matrix) of the unstandardised PCA, with variable names
loadings <- data.frame(pca_human$rotation,
.names = row.names(pca_human$rotation))
# Place each variable's name at its (PC1, PC2) loading inside the circle
p + geom_text(data=loadings,
mapping=aes(x = PC1, y = PC2, label = .names, colour = .names)) +
coord_fixed(ratio=1) +
labs(x = "PC1", y = "PC2")
pca_human_std
## Standard deviations (1, .., p=8):
## [1] 2.0708380 1.1397204 0.8750485 0.7788630 0.6619563 0.5363061 0.4589994
## [8] 0.3222406
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4
## Life.expectancy -0.44372240 0.02530473 -0.10991305 0.05834819
## Exp.school.years -0.42766720 -0.13940571 0.07340270 0.07020294
## GNIncome -0.35048295 -0.05060876 0.20168779 0.72727675
## Mat.Mort.Rate 0.43697098 -0.14508727 0.12522539 0.25170614
## Adol.Birth.Rate 0.41126010 -0.07708468 -0.01968243 -0.04986763
## Rep.Parliament... -0.08438558 -0.65136866 -0.72506309 -0.01396293
## Edu.F2M -0.35664370 -0.03796058 0.24223089 -0.62678110
## Lab.F2M 0.05457785 -0.72432726 0.58428770 -0.06199424
## PC5 PC6 PC7 PC8
## Life.expectancy -0.1628935 0.42242796 -0.43406432 -0.62737008
## Exp.school.years -0.1659678 0.38606919 0.77962966 0.05415984
## GNIncome 0.4950306 -0.11120305 -0.13711838 0.16961173
## Mat.Mort.Rate 0.1800657 -0.17370039 0.35380306 -0.72193946
## Adol.Birth.Rate 0.4672068 0.76056557 -0.06897064 0.14335186
## Rep.Parliament... 0.1523699 -0.13749772 0.00568387 0.02306476
## Edu.F2M 0.5983585 -0.17713316 0.05773644 -0.16459453
## Lab.F2M -0.2625067 0.03500707 -0.22729927 0.07304568
summary(pca_human_std)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0708 1.1397 0.87505 0.77886 0.66196 0.53631
## Proportion of Variance 0.5361 0.1624 0.09571 0.07583 0.05477 0.03595
## Cumulative Proportion 0.5361 0.6984 0.79413 0.86996 0.92473 0.96069
## PC7 PC8
## Standard deviation 0.45900 0.32224
## Proportion of Variance 0.02634 0.01298
## Cumulative Proportion 0.98702 1.00000
# Standard deviation of each principal component (prcomp's $sdev)
std_dev <- pca_human_std$sdev
# Squared standard deviations = component variances (the eigenvalues)
pr_var <- std_dev^2
# Proportion of total variance explained by each component
prop_varex <- pr_var/sum(pr_var)
# Scree plot: variance share per component; type = "b" joins points with lines
plot(prop_varex, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")
# Cumulative scree plot: running total of variance explained,
# useful for choosing how many components to retain
plot(cumsum(prop_varex), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b")
# Plot each variable's PC1/PC2 loadings inside a unit circle to get insight
# on a possible interpretation for the principal components
# 100 points around the circle; enough for a smooth geom_path outline
theta <- seq(0,2*pi,length.out = 100)
circle <- data.frame(x = cos(theta), y = sin(theta))
p <- ggplot(circle,aes(x,y)) + geom_path()
# Loadings matrix with variable names carried along as a column for labelling
loadings <- data.frame(pca_human_std$rotation,
.names = row.names(pca_human_std$rotation))
# Label each variable at its (PC1, PC2) loading; coord_fixed keeps the
# circle round so distances from the origin are comparable
p + geom_text(data=loadings,
mapping=aes(x = PC1, y = PC2, label = .names, colour = .names)) +
coord_fixed(ratio=1) +
labs(x = "PC1", y = "PC2")
# Load the 'tea' questionnaire dataset shipped with FactoMineR (300 respondents)
data(tea)
# Select few variables of interest for the MCA
keep <- c("Tea", "frequency", "sex", "How", "relaxing", "effect.on.health")
# all_of() replaces the superseded one_of(): it errors (instead of only
# warning) if any name in `keep` is missing, so typos are caught early
tea_select <- dplyr::select(tea, all_of(keep))
# Summary of the subsetted data
glimpse(tea_select)
## Observations: 300
## Variables: 6
## $ Tea <fct> black, black, Earl Grey, Earl Grey, Earl Grey...
## $ frequency <fct> 1/day, 1/day, +2/day, 1/day, +2/day, 1/day, 3...
## $ sex <fct> M, F, F, M, M, M, M, F, M, M, M, M, M, M, M, ...
## $ How <fct> alone, milk, alone, alone, alone, alone, alon...
## $ relaxing <fct> No.relaxing, No.relaxing, relaxing, relaxing,...
## $ effect.on.health <fct> No.effect on health, No.effect on health, No....
# Visualise distributions of the variables.
# pivot_longer() replaces the superseded gather(); the factor columns are
# coerced to character first, because factors with different level sets
# cannot be stacked without dropping their attributes -- this is what
# produced the warning recorded below in the original gather() run.
tea_select %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap("key", scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  geom_bar()
## Warning emitted by the original gather() call (no longer produced):
## attributes are not identical across measure variables;
## they will be dropped
What we see here is that Earl Grey is the most consumed kind of tea, and the majority of respondents describe tea as relaxing. However, many report no effect on health. Most respondents are female, and many drink their tea alone at a high frequency of +2/day.
# Multiple Correspondence Analysis on the six categorical tea variables;
# graph = FALSE suppresses FactoMineR's automatic plotting
tea_mca <- MCA(tea_select, graph = FALSE)
# Summary of MCA: eigenvalues per dimension, plus coordinates, contributions
# and cos2 for the first individuals and categories
summary(tea_mca)
##
## Call:
## MCA(X = tea_select, graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## Variance 0.227 0.213 0.187 0.185 0.182 0.174
## % of var. 12.387 11.631 10.218 10.080 9.908 9.468
## Cumulative % of var. 12.387 24.018 34.236 44.316 54.225 63.692
## Dim.7 Dim.8 Dim.9 Dim.10 Dim.11
## Variance 0.152 0.148 0.130 0.119 0.116
## % of var. 8.267 8.098 7.092 6.513 6.338
## Cumulative % of var. 71.959 80.057 87.149 93.662 100.000
##
## Individuals (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## 1 | 0.438 0.282 0.126 | 0.224 0.078 0.033 | -0.423 0.318
## 2 | 0.328 0.158 0.056 | 0.282 0.125 0.041 | -0.767 1.046
## 3 | -0.636 0.594 0.603 | -0.342 0.183 0.174 | 0.064 0.007
## 4 | 0.211 0.065 0.048 | -0.085 0.011 0.008 | -0.086 0.013
## 5 | -0.304 0.136 0.115 | -0.019 0.001 0.000 | 0.039 0.003
## 6 | 0.211 0.065 0.048 | -0.085 0.011 0.008 | -0.086 0.013
## 7 | 0.072 0.008 0.003 | -0.197 0.061 0.021 | 0.534 0.508
## 8 | -0.178 0.046 0.013 | 0.854 1.139 0.308 | 0.068 0.008
## 9 | -0.082 0.010 0.005 | 0.362 0.205 0.098 | -0.330 0.194
## 10 | -0.444 0.289 0.162 | 0.500 0.391 0.206 | -0.172 0.053
## cos2
## 1 0.117 |
## 2 0.304 |
## 3 0.006 |
## 4 0.008 |
## 5 0.002 |
## 6 0.008 |
## 7 0.152 |
## 8 0.002 |
## 9 0.082 |
## 10 0.024 |
##
## Categories (the 10 first)
## Dim.1 ctr cos2 v.test Dim.2 ctr cos2
## black | -0.374 2.527 0.046 -3.697 | 1.127 24.502 0.416
## Earl Grey | 0.027 0.034 0.001 0.624 | -0.312 4.895 0.176
## green | 0.681 3.740 0.057 4.138 | -0.703 4.252 0.061
## 1/day | 0.726 12.248 0.244 8.545 | -0.188 0.879 0.016
## 1 to 2/week | 0.330 1.169 0.019 2.362 | 0.811 7.541 0.113
## +2/day | -0.745 17.264 0.408 -11.044 | -0.006 0.001 0.000
## 3 to 6/week | 0.330 0.903 0.014 2.037 | -0.500 2.212 0.032
## F | -0.386 6.489 0.217 -8.063 | -0.364 6.136 0.193
## M | 0.563 9.467 0.217 8.063 | 0.531 8.953 0.193
## alone | -0.100 0.473 0.018 -2.345 | -0.294 4.389 0.160
## v.test Dim.3 ctr cos2 v.test
## black 11.154 | -0.245 1.320 0.020 -2.427 |
## Earl Grey -7.245 | 0.302 5.220 0.165 7.013 |
## green -4.275 | -1.216 14.478 0.183 -7.394 |
## 1/day -2.218 | -0.589 9.763 0.161 -6.929 |
## 1 to 2/week 5.814 | 1.250 20.375 0.268 8.958 |
## +2/day -0.093 | -0.266 2.665 0.052 -3.941 |
## 3 to 6/week -3.089 | 1.021 10.516 0.133 6.313 |
## F -7.598 | 0.027 0.037 0.001 0.557 |
## M 7.598 | -0.039 0.055 0.001 -0.557 |
## alone -6.926 | 0.158 1.436 0.046 3.714 |
##
## Categorical variables (eta2)
## Dim.1 Dim.2 Dim.3
## Tea | 0.086 0.430 0.236 |
## frequency | 0.430 0.136 0.487 |
## sex | 0.217 0.193 0.001 |
## How | 0.201 0.310 0.262 |
## relaxing | 0.259 0.080 0.025 |
## effect.on.health | 0.169 0.130 0.113 |
# Describe each MCA dimension: which variables (R2/eta2) and which
# categories are most strongly associated with it, with p-values
dimdesc(tea_mca)
## $`Dim 1`
## $`Dim 1`$quali
## R2 p.value
## frequency 0.43035283 6.126221e-36
## relaxing 0.25926867 3.411902e-21
## sex 0.21740720 1.341552e-17
## How 0.20055142 2.561497e-14
## effect.on.health 0.16915112 1.127903e-13
## Tea 0.08585463 1.624676e-06
##
## $`Dim 1`$category
## Estimate p.value
## No.relaxing 0.25038745 3.411902e-21
## 1/day 0.26975535 7.022606e-20
## M 0.22617501 1.341552e-17
## effect on health 0.23656752 1.127903e-13
## milk 0.43694935 1.152059e-06
## green 0.27131834 2.810580e-05
## 1 to 2/week 0.08083804 1.790922e-02
## alone 0.13398295 1.876177e-02
## 3 to 6/week 0.08083749 4.143368e-02
## black -0.23108131 1.910251e-04
## other -0.81706844 4.433926e-11
## No.effect on health -0.23656752 1.127903e-13
## F -0.22617501 1.341552e-17
## relaxing -0.25038745 3.411902e-21
## +2/day -0.43143088 8.756742e-36
##
##
## $`Dim 2`
## $`Dim 2`$quali
## R2 p.value
## Tea 0.43049300 4.915996e-37
## How 0.30966049 1.171181e-23
## sex 0.19305357 1.366580e-15
## effect.on.health 0.13023599 1.171471e-10
## frequency 0.13603174 2.070879e-09
## relaxing 0.07990218 6.443703e-07
##
## $`Dim 2`$category
## Estimate p.value
## black 0.50330392 1.085763e-36
## M 0.20652085 1.366580e-15
## milk 0.09529808 1.587371e-12
## effect on health 0.20114095 1.171471e-10
## other 0.67944635 2.015273e-10
## 1 to 2/week 0.36104435 2.310087e-09
## relaxing 0.13468963 6.443703e-07
## 1/day -0.10046877 2.632170e-02
## 3 to 6/week -0.24421119 1.893538e-03
## green -0.34198306 1.479665e-05
## No.relaxing -0.13468963 6.443703e-07
## No.effect on health -0.20114095 1.171471e-10
## alone -0.39248172 5.493201e-13
## Earl Grey -0.16132086 3.485479e-14
## F -0.20652085 1.366580e-15
##
##
## $`Dim 3`
## $`Dim 3`$quali
## R2 p.value
## frequency 0.48690576 1.239113e-42
## How 0.26195167 2.130076e-19
## Tea 0.23624924 4.152609e-18
## effect.on.health 0.11251437 2.535985e-09
## relaxing 0.02536586 5.696963e-03
##
## $`Dim 3`$category
## Estimate p.value
## 1 to 2/week 0.38761770 5.303215e-22
## Earl Grey 0.29800933 2.621465e-13
## 3 to 6/week 0.28878032 6.837716e-11
## effect on health 0.17523761 2.535985e-09
## lemon 0.45654646 7.757408e-08
## alone 0.15038168 1.782129e-04
## relaxing 0.07113249 5.696963e-03
## black 0.06115032 1.499559e-02
## No.relaxing -0.07113249 5.696963e-03
## other -0.34255089 2.707970e-03
## +2/day -0.26836999 6.790217e-05
## No.effect on health -0.17523761 2.535985e-09
## 1/day -0.40802803 5.332058e-13
## milk -0.26437725 8.963243e-14
## green -0.35915965 9.118354e-15
# Plotting MCA: hide the individual points (invisible = "ind") and colour
# the category labels by their variable (habillage = "quali")
plot(tea_mca, invisible=c("ind"), habillage = "quali")
# Load the long-form RATS and BPRS data plus the wide-form BPRS
RATSL <- read.csv("data/RATSL.csv")
BPRSL <- read.csv("data/BPRSL.csv")
BPRS <- read.csv("data/BPRS.csv")
# ID and Group are categorical, not numeric codes
RATSL$ID <- factor(RATSL$ID)
RATSL$Group <- factor(RATSL$Group)
# WD is the measurement-day label ("WD1", "WD8", ...), kept as character
RATSL$WD <- as.character(RATSL$WD)
glimpse(RATSL)
## Observations: 176
## Variables: 5
## $ ID <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ Group <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1...
## $ WD <chr> "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1",...
## $ Weight <int> 240, 225, 245, 260, 255, 260, 275, 245, 410, 405, 445, ...
## $ Time <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8...
head(RATSL)
## ID Group WD Weight Time
## 1 1 1 WD1 240 1
## 2 2 1 WD1 225 1
## 3 3 1 WD1 245 1
## 4 4 1 WD1 260 1
## 5 5 1 WD1 255 1
## 6 6 1 WD1 260 1
# Draw the plot: one weight trajectory per rat, panelled by diet group
ggplot(RATSL, aes(x = Time, y = Weight, linetype = ID)) +
geom_line() +
# recycle 10 linetypes so all 16 rats get a line style
scale_linetype_manual(values = rep(1:10, times=4)) +
facet_grid(. ~ Group, labeller = label_both) +
theme(legend.position = "none") +
# common y-range across panels so groups are directly comparable
scale_y_continuous(limits = c(min(RATSL$Weight), max(RATSL$Weight)))
# Standardise Weight within each time point (z-score per Time),
# which removes the overall growth trend from the profiles
RATSSTD <- RATSL %>%
group_by(Time) %>%
mutate(stdWeight = (Weight - mean(Weight))/sd(Weight) ) %>%
ungroup()
glimpse(RATSSTD)
## Observations: 176
## Variables: 6
## $ ID <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ Group <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1...
## $ WD <chr> "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD...
## $ Weight <int> 240, 225, 245, 260, 255, 260, 275, 245, 410, 405, 44...
## $ Time <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8...
## $ stdWeight <dbl> -1.0011429, -1.1203857, -0.9613953, -0.8421525, -0.8...
# Re-Draw the plot above from standardised data: tracking (rats staying in
# the same relative position over time) is easier to see on z-scores
ggplot(RATSSTD, aes(x = Time, y = stdWeight, linetype = ID)) +
geom_line() +
scale_linetype_manual(values = rep(1:10, times=4)) +
facet_grid(. ~ Group, labeller = label_both) +
theme(legend.position = "none") +
scale_y_continuous(name = "standardised RATS")
# Number of measurement occasions (time points), baseline included
n <- RATSL$Time %>% unique() %>% length()
# Summary data with mean and standard error of Weight by Group and Time.
# plyr (pulled in by ggbiplot) masks dplyr::summarise, so detach both first;
# guarding with search() keeps detach() from erroring when a package is
# not currently attached (unguarded detach() stops the whole knit).
if ("package:ggbiplot" %in% search()) detach(package:ggbiplot)
if ("package:plyr" %in% search()) detach(package:plyr)
RATSS <- RATSL %>%
group_by(Group, Time) %>%
# NOTE(review): se divides by sqrt(number of time points) as in the course
# material; sqrt(rats per group) would be the usual choice -- confirm
summarise( mean = mean(Weight), se = sd(Weight)/sqrt(n) ) %>%
ungroup()
# Glimpse the data
glimpse(RATSS)
## Observations: 33
## Variables: 4
## $ Group <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,...
## $ Time <int> 1, 8, 15, 22, 29, 36, 43, 44, 50, 57, 64, 1, 8, 15, 22, ...
## $ mean <dbl> 250.625, 255.000, 254.375, 261.875, 264.625, 265.000, 26...
## $ se <dbl> 4.589478, 3.947710, 3.460116, 4.100800, 3.333956, 3.5529...
# Mean weight profile per group with +/- 1 se error bars
ggplot(RATSS, aes(x = Time, y = mean, linetype = Group, shape = Group)) +
geom_line() +
scale_linetype_manual(values = c(1,2,3)) +
geom_point(size=3) +
scale_shape_manual(values = c(1,2,3)) +
# linetype = "1" inside aes() overrides the Group linetype mapping so all
# error bars are drawn with a single (solid) line style
geom_errorbar(aes(ymin = mean - se, ymax = mean + se, linetype="1"), width = 0.3) +
# legend placed inside the panel area
theme(legend.position = c(0.8,0.35)) +
scale_y_continuous(name = "mean(Weight) +/- se(Weight)")
# Per-rat mean weight over the follow-up, excluding the baseline
# measurement (Time == 1); one row per rat
RATSL.box <- RATSL %>%
filter(Time > 1) %>%
group_by(Group, ID) %>%
summarise( mean=mean(Weight) ) %>%
ungroup()
# Glimpse the data
glimpse(RATSL.box)
## Observations: 16
## Variables: 3
## $ Group <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
## $ ID <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
## $ mean <dbl> 263.2, 238.9, 261.7, 267.2, 270.9, 276.2, 274.6, 267.5, ...
# Draw boxplot of per-rat mean weights by group; the white triangle
# (shape 25) marks each group's mean
ggplot(RATSL.box, aes(x = Group, y = mean)) +
geom_boxplot() +
# fun.y was deprecated in ggplot2 3.3.0; fun is the current argument
stat_summary(fun = mean, geom = "point", shape = 25, size = 4, fill = "white") +
scale_y_continuous(name = "mean(RATS)")
# Remove one outlier per group by keeping only the non-outlying means:
# Group 1 drops the rat below 250, Group 2 the one above 590,
# Group 3 the one below 500
RATSL.box.out <- filter(RATSL.box,
(Group == 1 & mean > 250) |
(Group == 2 & mean < 590) |
(Group == 3 & mean > 500))
# Redraw the boxplot without the outliers
ggplot(RATSL.box.out, aes(x = Group, y = mean)) +
geom_boxplot() +
# fun.y was deprecated in ggplot2 3.3.0; fun is the current argument
stat_summary(fun = mean, geom = "point", shape=25, size=4, fill = "white") +
scale_y_continuous(name = "mean(RATS)")
# One-way ANOVA: does mean weight differ between the diet groups?
anova_rats <- aov(mean ~ Group, data = RATSL.box.out)
summary(anova_rats)
## Df Sum Sq Mean Sq F value Pr(>F)
## Group 2 176917 88458 2836 1.69e-14 ***
## Residuals 10 312 31
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Refactoring: subject and treatment are categorical identifiers, not
# numbers, in both the long (BPRSL) and wide (BPRS) data
BPRSL$subject <- factor(BPRSL$subject)
BPRSL$treatment <- factor(BPRSL$treatment)
# weeks holds the original wide-format column label ("week0", ...)
BPRSL$weeks <- as.character(BPRSL$weeks)
BPRS$subject <- factor(BPRS$subject)
BPRS$treatment <- factor(BPRS$treatment)
head(BPRSL)
## treatment subject weeks bprs week
## 1 1 1 week0 42 0
## 2 1 2 week0 58 0
## 3 1 3 week0 54 0
## 4 1 4 week0 55 0
## 5 1 5 week0 72 0
## 6 1 6 week0 48 0
# initialize plot with data and aesthetic mapping: bprs over time, coloured
# by treatment, with a per-treatment linear trend overlaid
ggplot(BPRSL, aes(x = week, y = bprs, col=treatment)) +
geom_point(size = 3, alpha = .3) +
geom_smooth(method = "lm") +
theme_minimal()
# Individual bprs trajectories, one panel per treatment
ggplot(BPRSL, aes(x = week, y = bprs, linetype = subject)) +
geom_line(aes(col=treatment)) +
scale_linetype_manual(values = rep(1:10, times=4)) +
facet_grid(. ~ treatment, labeller = label_both) +
theme(legend.position = "none") +
scale_y_continuous(limits = c(min(BPRSL$bprs), max(BPRSL$bprs)))
# Naive linear regression ignoring the repeated measures per subject:
# bprs ~ week + treatment
reg.bprs <- lm(bprs ~ week + treatment, data = BPRSL)
# print out a summary of the model
summary(reg.bprs)
##
## Call:
## lm(formula = bprs ~ week + treatment, data = BPRSL)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.454 -8.965 -3.196 7.002 50.244
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.4539 1.3670 33.982 <2e-16 ***
## week -2.2704 0.2524 -8.995 <2e-16 ***
## treatment2 0.5722 1.3034 0.439 0.661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.37 on 357 degrees of freedom
## Multiple R-squared: 0.1851, Adjusted R-squared: 0.1806
## F-statistic: 40.55 on 2 and 357 DF, p-value: < 2.2e-16
# Random intercept model: each subject gets their own baseline level;
# REML = FALSE (maximum likelihood) so models can be compared with anova()
intercept.bprs <- lmer(bprs ~ week + treatment + (1 | subject),
data = BPRSL, REML = FALSE)
summary(intercept.bprs)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: bprs ~ week + treatment + (1 | subject)
## Data: BPRSL
##
## AIC BIC logLik deviance df.resid
## 2748.7 2768.1 -1369.4 2738.7 355
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0481 -0.6749 -0.1361 0.4813 3.4855
##
## Random effects:
## Groups Name Variance Std.Dev.
## subject (Intercept) 47.41 6.885
## Residual 104.21 10.208
## Number of obs: 360, groups: subject, 20
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 46.4539 1.9090 24.334
## week -2.2704 0.2084 -10.896
## treatment2 0.5722 1.0761 0.532
##
## Correlation of Fixed Effects:
## (Intr) week
## week -0.437
## treatment2 -0.282 0.000
# Random intercept + random slope model: subjects may differ in both their
# baseline level and their rate of change over the weeks
mix.reg.bprs <- lmer(bprs ~ week + treatment + (week | subject),
data = BPRSL, REML = FALSE)
# Summary of the model
summary(mix.reg.bprs)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: bprs ~ week + treatment + (week | subject)
## Data: BPRSL
##
## AIC BIC logLik deviance df.resid
## 2745.4 2772.6 -1365.7 2731.4 353
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8919 -0.6194 -0.0691 0.5531 3.7976
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## subject (Intercept) 64.8202 8.0511
## week 0.9608 0.9802 -0.51
## Residual 97.4307 9.8707
## Number of obs: 360, groups: subject, 20
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 46.4539 2.1052 22.066
## week -2.2704 0.2977 -7.626
## treatment2 0.5722 1.0405 0.550
##
## Correlation of Fixed Effects:
## (Intr) week
## week -0.582
## treatment2 -0.247 0.000
# ANOVA test: likelihood-ratio comparison of the random-slope model against
# the random-intercept-only model (possible because both used REML = FALSE)
anova(mix.reg.bprs, intercept.bprs)
## Data: BPRSL
## Models:
## intercept.bprs: bprs ~ week + treatment + (1 | subject)
## mix.reg.bprs: bprs ~ week + treatment + (week | subject)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## intercept.bprs 5 2748.7 2768.1 -1369.4 2738.7
## mix.reg.bprs 7 2745.4 2772.6 -1365.7 2731.4 7.2721 2 0.02636
##
## intercept.bprs
## mix.reg.bprs *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model with interaction between time and treatment: week * treatment
# expands to week + treatment + week:treatment, letting the slope over
# time differ between the two treatment groups
interaction.bprs <- lmer(bprs ~ week * treatment + (week | subject),
data = BPRSL, REML = FALSE)
# Summary of the model
summary(interaction.bprs)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: bprs ~ week * treatment + (week | subject)
## Data: BPRSL
##
## AIC BIC logLik deviance df.resid
## 2744.3 2775.4 -1364.1 2728.3 352
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0512 -0.6271 -0.0767 0.5288 3.9260
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## subject (Intercept) 65.0016 8.0624
## week 0.9688 0.9843 -0.51
## Residual 96.4699 9.8219
## Number of obs: 360, groups: subject, 20
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 47.8856 2.2522 21.262
## week -2.6283 0.3589 -7.323
## treatment2 -2.2911 1.9090 -1.200
## week:treatment2 0.7158 0.4010 1.785
##
## Correlation of Fixed Effects:
## (Intr) week trtmn2
## week -0.650
## treatment2 -0.424 0.469
## wek:trtmnt2 0.356 -0.559 -0.840
# ANOVA test: does adding the week:treatment interaction significantly
# improve on the random-slope model?
anova(interaction.bprs, mix.reg.bprs)
## Data: BPRSL
## Models:
## mix.reg.bprs: bprs ~ week + treatment + (week | subject)
## interaction.bprs: bprs ~ week * treatment + (week | subject)
## Df AIC BIC logLik deviance Chisq Chi Df
## mix.reg.bprs 7 2745.4 2772.6 -1365.7 2731.4
## interaction.bprs 8 2744.3 2775.4 -1364.1 2728.3 3.1712 1
## Pr(>Chisq)
## mix.reg.bprs
## interaction.bprs 0.07495 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Create a vector of the fitted values from the interaction model
Fitted <- fitted(interaction.bprs)
# Add the fitted values as a new column of BPRSL
BPRSL <- BPRSL %>%
mutate(Fitted)
# Shared y-axis limits so the two plots are directly comparable.
# The original used c(min(BPRSL$Fitted), max(BPRSL$bprs)), which silently
# clips (drops) any observed bprs value below the smallest fitted value;
# range() over both columns keeps every point visible in both plots.
y_limits <- range(c(BPRSL$Fitted, BPRSL$bprs))
# Fitted plot
ggplot(BPRSL, aes(x = week, y = Fitted, linetype = subject)) +
geom_line(aes(col=treatment)) +
scale_linetype_manual(values = rep(1:10, times=4)) +
theme(legend.position = "none") +
scale_y_continuous(limits = y_limits) +
ggtitle("Fitted")
# Observed plot
ggplot(BPRSL, aes(x = week, y = bprs, linetype = subject)) +
geom_line(aes(col=treatment)) +
scale_linetype_manual(values = rep(1:10, times=4)) +
theme(legend.position = "none") +
scale_y_continuous(limits = y_limits) +
ggtitle("Observed")